#!/usr/bin/env perl
# https://github.com/shenwei356/bio_scripts

use strict;
use Getopt::Long;
use BioUtil::Seq;
use BioUtil::Util;

my $usage = q(
fasta2tab - transform the fasta fromat to two-column table

Usage: fasta2tab [options] [fastafiles...]
Options:
    -r,   --reverse             Reverse sequence
    -c,   --complement          Complement sequence
    -rc,  --reversecomplement   Reversecomplement
    -sub, --subseq  INT,INT     Substring of sequence, 1-based
                                Examples:
                                    seq         ACGAGACGTA
                                    index       1234567890

                                     option      subseq
                                    --------------------
                                    -sub  2,7    CGAGAC
                                    -sub  2,2    C
                                    -sub   ,7   ACGAGAC
                                    -sub  2,     CGAGACGTA
                                    -sub -3,           GTA
                                    -sub -3,-2         GT
                                    -sub   ,-3  ACGAGACG

    -t,   --trim                Trim non-Latin alphabet
    -lc,  --lowercase           Lowercase
    -uc,  --uppercase           Uppercase

    -l,   --length              Ouput sequence length at another column
    -l2,  --length2             Ouput number of latin-letter in sequence
                                at another column
    -bc,  --bc STRING[,STRING]  Ouput base content
                                Examples:
                                    'GC' :  G+C content,
                                    'G,C':  G and C, in two column
    -gc,  --gc                  Ouput GC content at another column

    -h,   --help                Show this help information

Examples:

    1. sort fasta by sequnece length
       cat seq.fa | fasta2tab -t -l | sort -r -t"`echo -e '\t'`" -n -k3,3 \
         |  tab2fasta -l 70 > seq.sorted.fa

    2. extract sub sequence
       fasta2tab -t -sub 3,10 -rc  seq.fa | tab2fasta

    3. extract sequence longer than 1000 bp
       cat seq.fa | fasta2tab -t -l | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70

    4. extract aligned sequence of which the original sequence is longer than 1000 bp
       cat seq.fa | fasta2tab   -l2 | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70

    5. reverse complement sequence, uppercase, and trim gaps
       zcat seq.fa.gz | fasta2tab -uc -rc -t | tab2fasta

This script is usually used in pair with tab2fasta.
https://github.com/shenwei356/bio_scripts

);

my $para = {};
GetOptions(
    'help|h' => \$$para{help},

    'reverse|r'            => \$$para{rev},
    'complement|c'         => \$$para{comp},
    'reversecomplement|rc' => \$$para{rc},
    'subseq|sub=s'         => \$$para{sub},

    'trim|t'       => \$$para{trim},
    'lowercase|lc' => \$$para{lc},
    'uppercase|uc' => \$$para{uc},

    'length|l'   => \$$para{len},
    'length2|l2' => \$$para{len2},
    'bc=s'       => \$$para{bc},
    'gc'         => \$$para{gc},
) or die $usage;

die $usage if $$para{help};
if ( $$para{sub} ) {
    die qq(
parameter of -sub not correct.

examples:
    seq         ACGAGACGTA
    index       1234567890

     option      subseq
    --------------------
    -sub  2,7    CGAGAC
    -sub  2,2    C
    -sub   ,7   ACGAGAC
    -sub  2,     CGAGACGTA
    -sub -3,           GTA
    -sub -3,-2         GT
    -sub   ,-3  ACGAGACG

) unless $$para{sub} =~ /^(-?\d*),(-?\d*)$/;
    die "warning: end ($2) should be >= start ($1)\n" if $2 ne '' and $1 ne '' and $2 < $1 ;
}

my @files = file_list_from_argv(@ARGV);

for my $file (@files) {
    my $next_seq = FastaReader($file);
    while ( my $fa = &$next_seq() ) {
        my ( $header, $seq ) = @$fa;

        $header =~ s/\t/__tab__/g;

        if ( $$para{trim} ) {
            $seq =~ s/[^a-zA-Z]+//g;
        }

        if ( $$para{sub} ) {
            my ( $start, $end ) = split /,/, $$para{sub};
            if ( $start eq '' ) {
                $start = 1;
            }
            elsif ( $start < 0 ) {
                $start += 1;
            }

            if ( $end eq '' ) {
                $end = 1 + length $seq;
            }
            elsif ( $end < 0 ) {
                $end += 1;
            }
            $seq = substr $seq, $start - 1, $end - $start + 1;
        }

        if ( $$para{rc} ) {
            $seq = revcom($seq);
        }
        else {
            $seq = complement($seq) if $$para{comp};
            $seq = reverse $seq     if $$para{rev};
        }

        if ( $$para{lc} ) {
            $seq = lc $seq;
        }
        elsif ( $$para{uc} ) {
            $seq = uc $seq;
        }

        print "$header\t$seq";
        print "\t", length $seq if $$para{len};
        if ( $$para{len2} ) {
            if ( $$para{trim} ) {
                print "\t", length $seq;
            }
            else {
                my $seq2 = $seq;
                $seq2 =~ s/[^a-zA-Z]+//g;
                print "\t", length $seq2;
            }
        }

        if ($$para{gc}) {
            print "\t", base_content( 'gc', $seq );
        } elsif ($$para{bc}) {
            my @bases = split /,/, $$para{bc};
            for my $base (@bases) {
                print "\t", base_content( $base, $seq );
            }
        }
        print "\n";
    }
}
